Notebook


## Import packages
library("dplyr") 
library("Seurat") # scRNA-seq analysis
library("patchwork")
if (!"SeuratWrappers" %in% installed.packages()) remotes::install_github("satijalab/seurat-wrappers", "seurat5")
library("SeuratWrappers")
if (!"batchelor" %in% installed.packages()) BiocManager::install("batchelor")
if ((!"matrixStats" %in% installed.packages()) | (packageVersion("matrixStats")>"1.1.0")) {
  remotes::install_version("matrixStats", version="1.1.0")
} 
if (!"ComplexHeatmap" %in% installed.packages()) remotes::install_github("jokergoo/ComplexHeatmap")
if (!"zellkonverter" %in% installed.packages()) BiocManager::install("zellkonverter")






(1) Import datasets

(7 min)


# Import data
data.dir <- "../data"
seu <- readRDS(file = file.path(data.dir, "jurkat.rds"))


Downsample dataset


You may want to down sample this data set depending on the amount of RAM memory you have. The jurkat data set has 8,664 cells.

## Downsample data set
downsample <- TRUE # replace to FALSE in case you don't want to down sample
prop.down <- 0.4 # proportion of cells to down sample per batch: 40% of the cells
if (downsample) {
  no.cells.batch <- ceiling(table(seu$batch) * 0.4) # CTRL = 1310 and STIM = 1491 
  cell.idx.batch <- split(x = colnames(seu), f = seu$batch) # split into a list the cell names per batch
  cell.idx.batch.down <- lapply(X = setNames(names(cell.idx.batch), names(cell.idx.batch)), FUN = function(x) {
    set.seed(123)
    sample(x = cell.idx.batch[[x]], size = no.cells.batch[[x]], replace = FALSE)
  }) # downsample each batch cell names 
  cell.idx.downsample <- do.call(c, cell.idx.batch.down) # join cell name labels from the two batches into one character vector
  seu <- subset(seu, cells = cell.idx.downsample)
}
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  3293228 175.9    4808767 256.9  4808767 256.9
## Vcells 23036169 175.8   81939233 625.2 84635607 645.8






(2) Assess batch effect

(7 min)


Joint dimred


## Joint analysis

# Standard Seurat upstream workflow
seu <- NormalizeData(seu)
seu <- FindVariableFeatures(seu)
seu <- ScaleData(seu)
seu <- RunPCA(seu)
seu <- RunUMAP(seu, dims = 1:30, reduction = "pca", reduction.name = "umap.unintegrated")
## Plot jointly dimreds
pca.unint <- DimPlot(seu, reduction = "pca", group.by = "batch")
umap.unint <- DimPlot(seu, reduction = "umap.unintegrated", group.by = "batch")
pca.unint + umap.unint


Celltype markers


## Joint celltype markers

# List of jurkat and T293 cell lines
markers.plot <- list(
  "jurkat" = "CD3D", 
  "t293" = "XIST"
)

# Plot
jurkat.markers.unint.plot <- FeaturePlot(seu, features = markers.plot$jurkat, split.by = "batch", 
                                       max.cutoff = 3, cols = c("grey", "red"), 
                                       reduction = "umap.unintegrated", ncol = 4, 
                                       pt.size = 0.5)
t293.markers.unint.plot <- FeaturePlot(seu, features = markers.plot$t293, split.by = "batch", 
                                           max.cutoff = 3, cols = c("grey", "red"), 
                                           reduction = "umap.unintegrated", ncol = 4, 
                                           pt.size = 0.5)
## Plot jointly celltype markers

# Print 
jurkat.markers.unint.plot 

t293.markers.unint.plot


Manual cell annotation


## Independent sample analysis

# Split Seurat object into two batch on 'batch' label identity
seu.list <- SplitObject(object = seu, split.by = "batch")

# Standard Seurat upstream workflow
seu.list <- lapply(X = seu.list, FUN = function(x) {
  x <- NormalizeData(x)
  x <- FindVariableFeatures(x)
  x <- ScaleData(x)
  x <- RunPCA(x)
  x <- FindNeighbors(x, dims = 1:15, reduction = "pca")
  x <- FindClusters(x, resolution = 0.1, cluster.name = "unintegrated_clusters")
  x <- RunUMAP(x, dims = 1:15, reduction = "pca", reduction.name = "umap.unintegrated")
})
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1142
## Number of edges: 45157
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9000
## Number of communities: 1
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1303
## Number of edges: 48196
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9000
## Number of communities: 1
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1022
## Number of edges: 40793
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9496
## Number of communities: 2
## Elapsed time: 0 seconds
## Plot independent sample analysis clusters
umap.ind.samp.unint <- lapply(X = seu.list, FUN = function(x) {
  DimPlot(x, reduction = "umap.unintegrated", group.by = "unintegrated_clusters", pt.size = 0.5)
})
umap.ind.samp.unint$t293 + umap.ind.samp.unint$jurkat + umap.ind.samp.unint$`jurkat_t293_50:50`






(3) Integrate datasets

(7 min)


## Perform integration

# Split layers for integration
seu[["RNA"]] <- split(x = seu[["RNA"]], f = seu$batch)

# Standard workflow
seu <- NormalizeData(seu)
seu <- FindVariableFeatures(seu)
seu <- ScaleData(seu)
seu <- RunPCA(seu)

# Integrate layers
int.methods <- c("CCA" = "CCAIntegration", "RPCA" = "RPCAIntegration", 
                 "Harmony" = "HarmonyIntegration", "FastMNN" = "FastMNNIntegration", 
                 "scVI" = "scVIIntegration")

for (m in names(int.methods)[1:4]) {
  cat("\nRunning integration method", m, "...\n")
  int.dimred <- paste0("integrated.", m)
  umap.dimred <- paste0("umap.", m)
  # Integration
  if (m=="scVI") {
          seu <- IntegrateLayers(object = seu, method = get(eval(substitute(int.methods[m]))), 
                         orig.reduction = "pca", 
                         new.reduction = int.dimred,
                         conda_env = "/home/aggs/miniconda3/envs/scvi-env", # substitute this by your installation 
                         verbose = TRUE)
  } else {
      seu <- IntegrateLayers(object = seu, method = get(eval(substitute(int.methods[m]))), 
                         orig.reduction = "pca", 
                         new.reduction = int.dimred,
                         verbose = TRUE)
  }

}
## 
## Running integration method CCA ...
## 
## Running integration method RPCA ...
## 
## Running integration method Harmony ...
## 
## Running integration method FastMNN ...
# Re-join layers after integration
seu[["RNA"]] <- JoinLayers(seu[["RNA"]])

# Run UMAP for every integration method
int.umaps.plots <- list()
for (m in names(int.methods)[1:4]) {
  cat("\nRunning UMAP for", m, "integrated result...\n")
  int.dimred <- paste0("integrated.", m)
  umap.dimred <- paste0("umap.", m)
  seu <- RunUMAP(seu, dims = 1:30, reduction = int.dimred, reduction.name = umap.dimred)
  int.umaps.plots[[m]] <-  DimPlot(object = seu, reduction = umap.dimred, group.by = c("batch", "cell_type"), 
                                   combine = FALSE, label.size = 2)
}
## 
## Running UMAP for CCA integrated result...
## 
## Running UMAP for RPCA integrated result...
## 
## Running UMAP for Harmony integrated result...
## 
## Running UMAP for FastMNN integrated result...






(4) Assess integration

(7 min)


## Assess integration by printing the plots using the "batch" and "cell_type" (ground-truth) labels
wrap_plots(c(int.umaps.plots$CCA, int.umaps.plots$RPCA, int.umaps.plots$Harmony, int.umaps.plots$FastMNN),
           ncol = 2, byrow = TRUE)






R packages used and respective versions


## R packages and versions used in these analyses
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=fi_FI.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=fi_FI.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=fi_FI.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=fi_FI.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] SeuratWrappers_0.3.2 patchwork_1.1.2      Seurat_5.1.0        
## [4] SeuratObject_5.0.2   sp_1.5-1             dplyr_1.1.4         
## 
## loaded via a namespace (and not attached):
##   [1] spam_2.9-1                  plyr_1.8.7                 
##   [3] igraph_1.3.5                lazyeval_0.2.2             
##   [5] splines_4.1.0               RcppHNSW_0.4.1             
##   [7] BiocParallel_1.28.3         listenv_0.8.0              
##   [9] scattermore_1.2             GenomeInfoDb_1.30.1        
##  [11] ggplot2_3.4.0               digest_0.6.30              
##  [13] htmltools_0.5.5             fansi_1.0.3                
##  [15] magrittr_2.0.3              ScaledMatrix_1.2.0         
##  [17] tensor_1.5                  cluster_2.1.4              
##  [19] ROCR_1.0-11                 remotes_2.4.2              
##  [21] globals_0.16.2              matrixStats_1.1.0          
##  [23] R.utils_2.12.1              spatstat.sparse_3.0-0      
##  [25] colorspace_2.0-3            ggrepel_0.9.2              
##  [27] xfun_0.43                   RCurl_1.98-1.9             
##  [29] jsonlite_1.8.4              progressr_0.11.0           
##  [31] spatstat.data_3.0-0         survival_3.4-0             
##  [33] zoo_1.8-11                  glue_1.6.2                 
##  [35] polyclip_1.10-4             gtable_0.3.1               
##  [37] zlibbioc_1.40.0             XVector_0.34.0             
##  [39] leiden_0.4.3                DelayedArray_0.20.0        
##  [41] BiocSingular_1.10.0         SingleCellExperiment_1.16.0
##  [43] future.apply_1.10.0         BiocGenerics_0.40.0        
##  [45] abind_1.4-5                 scales_1.3.0               
##  [47] spatstat.random_3.0-1       miniUI_0.1.1.1             
##  [49] Rcpp_1.0.9                  viridisLite_0.4.1          
##  [51] xtable_1.8-4                reticulate_1.26            
##  [53] rsvd_1.0.5                  dotCall64_1.0-2            
##  [55] ResidualMatrix_1.4.0        stats4_4.1.0               
##  [57] htmlwidgets_1.6.2           httr_1.4.5                 
##  [59] RColorBrewer_1.1-3          ellipsis_0.3.2             
##  [61] ica_1.0-3                   scuttle_1.4.0              
##  [63] pkgconfig_2.0.3             R.methodsS3_1.8.2          
##  [65] farver_2.1.1                sass_0.4.2                 
##  [67] uwot_0.1.16                 deldir_1.0-6               
##  [69] utf8_1.2.2                  tidyselect_1.2.0           
##  [71] labeling_0.4.2              rlang_1.1.2                
##  [73] reshape2_1.4.4              later_1.3.0                
##  [75] munsell_0.5.0               tools_4.1.0                
##  [77] cachem_1.0.6                cli_3.6.1                  
##  [79] generics_0.1.3              ggridges_0.5.4             
##  [81] batchelor_1.10.0            evaluate_0.17              
##  [83] stringr_1.5.0               fastmap_1.1.0              
##  [85] yaml_2.3.6                  goftest_1.2-3              
##  [87] RhpcBLASctl_0.21-247.1      knitr_1.46                 
##  [89] fitdistrplus_1.1-8          purrr_1.0.1                
##  [91] RANN_2.6.1                  sparseMatrixStats_1.6.0    
##  [93] pbapply_1.6-0               future_1.29.0              
##  [95] nlme_3.1-160                mime_0.12                  
##  [97] R.oo_1.25.0                 compiler_4.1.0             
##  [99] plotly_4.10.1               png_0.1-7                  
## [101] spatstat.utils_3.0-1        tibble_3.2.1               
## [103] bslib_0.4.0                 stringi_1.7.8              
## [105] highr_0.9                   RSpectra_0.16-1            
## [107] lattice_0.20-45             Matrix_1.6-5               
## [109] vctrs_0.6.5                 pillar_1.9.0               
## [111] lifecycle_1.0.3             BiocManager_1.30.19        
## [113] spatstat.geom_3.0-3         lmtest_0.9-40              
## [115] jquerylib_0.1.4             RcppAnnoy_0.0.20           
## [117] BiocNeighbors_1.12.0        bitops_1.0-7               
## [119] data.table_1.14.10          cowplot_1.1.1              
## [121] irlba_2.3.5.1               GenomicRanges_1.46.1       
## [123] httpuv_1.6.6                R6_2.5.1                   
## [125] promises_1.2.0.1            KernSmooth_2.23-20         
## [127] gridExtra_2.3               IRanges_2.28.0             
## [129] parallelly_1.32.1           codetools_0.2-18           
## [131] fastDummies_1.7.3           MASS_7.3-60                
## [133] SummarizedExperiment_1.24.0 withr_2.5.0                
## [135] sctransform_0.4.1           GenomeInfoDbData_1.2.7     
## [137] harmony_1.2.0               S4Vectors_0.32.4           
## [139] parallel_4.1.0              beachmat_2.10.0            
## [141] grid_4.1.0                  tidyr_1.3.0                
## [143] DelayedMatrixStats_1.16.0   rmarkdown_2.26             
## [145] MatrixGenerics_1.6.0        Rtsne_0.16                 
## [147] spatstat.explore_3.0-5      Biobase_2.54.0             
## [149] shiny_1.7.3